library(ngram)
library(digest)
library(marginaleffects)
library(fixest)
library(haven)
library(ggridges)
library(quanteda)
library(stm)
library(modelsummary)
library(kableExtra)
library(tidyverse)
options("modelsummary_format_numeric_latex" = "mathmode")

setwd('')
source('ggtheme_baselike.R')

# ---- Load Data ----
# sots-df.csv: speech-level dataset with governor covariates and partisan slant predictions
gov_covar_cw <- read_csv('sots-df.csv')
# state_policy_liberalism.dta: Caughey & Warshaw state policy liberalism estimates
cw <- read_dta('state_policy_liberalism.dta') %>%
    left_join(tibble(abb = datasets::state.abb, state = datasets::state.name))
# gov-congress-nominate.csv: NOMINATE scores for governors who previously served in Congress
gov_nominate <- read_csv('gov-congress-nominate.csv')
# gov_elex_results.csv: gubernatorial election results, 1960-2023
gov_elex_results <- read_csv('gov_elex_results.csv')
# missing-gov-party.csv: party IDs for governor-years missing from corpus (for balance tests)
missing_gov_covar <- read_csv('missing-gov-party.csv')
# census-regions.csv: Census region assignments for each state
regions <- read_csv('census-regions.csv')
# sots_sents_df_small.csv: sentence-level classifier predictions
sots_sents <- read_csv('sots_sents_df_small.csv')

# ---- Inline Statistics: Corpus Summary ----
# N speeches, N governors, N states — reported inline in the paper
# total number of speeches
nrow(gov_covar_cw)

# number of governors
gov_covar_cw %>% 
    left_join(gov_nominate) %>% 
    pull(governor) %>% 
    unique() %>% 
    length()

# number of states
gov_covar_cw$state %>% unique %>% length

# ---- Figure A1: Missingness Plot (Appendix A.1) ----
# Produces missingness_plot.pdf — tile plot of corpus coverage by state-year
# Also computes proportion missing (reported inline in Appendix A.1)
obs_grid_f <- expand_grid(year = 1960:2023, state = datasets::state.name) %>% 
    left_join(gov_covar_cw %>%
        filter((year >= 1962) & (party %in% c('D', 'R') & !is.na(text))) %>% 
        mutate(party = case_when(govlast == 'kaine' ~ 'D',
            govlast == 'sundquist' ~ 'R',
            TRUE ~ party)) %>% 
        select(year, state, party) %>% 
        unique() %>% 
        mutate(exist = 'Yes')) %>% 
    select(year, state, exist, party) %>% 
    mutate(exist = if_else(is.na(exist), 'No', exist))

obs_grid <- obs_grid_f %>% filter(year >= 1962)

obs_grid$state <- factor(obs_grid$state, levels = rev(sort(unique(obs_grid$state))))

# prop missing
round(table(obs_grid$exist)[2]/sum(table(obs_grid$exist)), 2)

miss_plot <- ggplot(obs_grid, aes(x = year, y = state, fill = exist)) +
  geom_tile() +
  scale_fill_manual(values = c('grey80', 'grey20')) +
  labs(x = "Year", y = "State", fill = "Speech in Corpus") +
  theme_baselike() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = 'bottom', legend.direction = 'horizontal') + 
  scale_x_continuous(expand = c(0,0), breaks = seq(1962, 2023, 5)) 

ggsave('missingness_plot.pdf', miss_plot, height = 8, width = 8)

# ---- Table A1: Missingness by Party and Region (Appendix A.1) ----
# Also computes correlation between missingness and year
gov_missing_grid <- missing_gov_covar %>%
    filter(year >= 1960 & party %in% c('R', 'D')) %>% 
    left_join(regions %>% rename(state = State)) %>% 
    left_join(obs_grid_f %>% select(state, year, exist)) %>% 
    left_join(gov_elex_results) %>% 
    left_join(cw %>% select(year, state, mass_cult_con_est, mass_econ_con_est))

# missingness by party and region
party_wide <- gov_missing_grid %>%
    group_by(party, exist) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(party) %>%
    mutate(prop = n / sum(n)) %>%
    filter(exist == "No") %>%
    select(category = party, n_missing = n, prop_missing = prop) %>%
    mutate(type = "Party")

region_wide <- gov_missing_grid %>%
    group_by(Region, exist) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(Region) %>%
    mutate(prop = n / sum(n)) %>%
    filter(exist == "No") %>%
    select(category = Region, n_missing = n, prop_missing = prop) %>%
    mutate(type = "Region")

bind_rows(party_wide, region_wide) %>%
    select(type, category, n_missing, prop_missing) %>%
    kable(format = "latex",
          booktabs = TRUE,
          digits = 2,
          col.names = c("", "Category", "N Missing", "Prop. Missing"),
          caption = "Missingness by Party and Region") %>%
    kable_styling(latex_options = c("hold_position", "scale_down"),
                      full_width = TRUE) %>% 
    collapse_rows(columns = 1, valign = "top")

round(cor(gov_missing_grid$year, ifelse(gov_missing_grid$exist == 'No', 1, 0)),2)

# ---- Table A2: Standardized Bias, Rolling Three-Year Windows (Appendix A.1) ----
# Loops over rolling 3-year windows (y-2 to y) and computes standardized bias
# for three variables: gov vote share, cultural ideology, economic ideology.
bias_df <- c()
for(y in 1962:2023){

    dfy <- gov_missing_grid %>% filter(year %in% (y-2):y)
    prop_missing <- mean(dfy$exist == 'No')

    for(var in c("gov_dempct", "mass_cult_con_est", "mass_econ_con_est")){

      observed_mean <- mean(dfy %>% filter(exist == 'Yes') %>% pull(var), na.rm = TRUE)
      complete_mean <- mean(dfy %>% pull(var), na.rm = TRUE)
      var_sd <- sd(dfy %>% pull(var), na.rm = TRUE)

     bias_df <- bind_rows(bias_df, 
        tibble(
            target_year = y,
            variable = var,
            prop_missing = prop_missing,
            bias = abs(observed_mean - complete_mean),
            bias_standardized = abs(observed_mean - complete_mean) / var_sd  # In SD units
          )
        )
    }

}

# Summary
bias_df %>%
    group_by(variable) %>%
    summarise(
      median_bias_std = median(abs(bias_standardized), na.rm = TRUE),
      pct_under_0.2sd = mean(abs(bias_standardized) < 0.2, na.rm = TRUE)
    ) %>% 
    mutate(variable = case_when(
            variable == 'gov_dempct' ~ 'Gov. Dem. Vote Share',
            variable == 'mass_cult_con_est' ~ 'Public Cultural Ideology',
            variable == 'mass_econ_con_est' ~ 'Public Econ. Ideology'
        )) %>% 
    kable(format = "latex",
          booktabs = TRUE,
          digits = 2,
          col.names = c("Variable", "Median Std. Bias", "Pct. Below 0.2 SD"),
          caption = "Standardized Bias for Rolling Three-Year Windows") %>%
    kable_styling(latex_options = c("hold_position")) 

# ---- Table A3: Missingness by Southern/Non-Southern Democrat, Pre-1994 (Appendix A.1) ----
gov_missing_grid %>%
    filter(party %in% c('R', 'D') & year < 1994) %>%
    mutate(south_dem = if_else(party == "D" & Region == "South", 'Southern Dem.', 'Non-Southern Dem.')) %>%
    group_by(south_dem, exist) %>%
    count() %>%
    group_by(south_dem) %>%
    mutate(prop = n / sum(n)) %>% 
    filter(exist == "No") %>%
    select(Category = south_dem, `N Missing` = n, `Prop. Missing` = prop) %>%
    kable(format = "latex",
          booktabs = TRUE,
          digits = 2,
          caption = "Missingness by Southern Democrat Status (Pre-1994)") %>%
    kable_styling(latex_options = c("hold_position"))

# ---- Table A4: Model Replication Controlling for Party/Region Imbalance (Appendix A.1) ----
# First computes party and region missingness bias per rolling 3-year window,
# then merges into the main df and runs the regression with bias controls
bias_df_reg <- c()
for(y in 1962:2023){

    dfy <- gov_missing_grid %>% filter(year %in% (y-2):y) %>% mutate(missing = if_else(exist == 'No', 1, 0)) 

    # for party differences
    party_bias <- dfy %>% 
        group_by(party) %>% 
        summarise(prop_missing = mean(missing)) %>% 
        pull(prop_missing) %>% 
        diff() %>% 
        abs()

    region_bias <- dfy %>% 
        group_by(Region) %>% 
        summarise(prop_missing = mean(missing),
            w = n()) %>% 
        mutate(w = w / sum(w)) %>% 
        summarise(bias = sum(w * abs(prop_missing - weighted.mean(prop_missing, w)))) %>% 
        pull(bias)

     bias_df_reg <- bind_rows(bias_df_reg, 
        tibble(
            year = y,
            party_bias = party_bias,
            region_bias = region_bias
          )
        )
    }

gap_df <- left_join(gov_covar_cw, bias_df_reg)
bias_controls <- feols(pr_rep_scale ~ (party +  rep20) * gingrich + running + election + unif + party_bias + region_bias | state + congress, gap_df, cluster = ~state)

c_map_bias <- c(
    'rep20' = 'Two-Party Republican Vote Share (20s)',
    'rep20:gingrich' = 'Two-Party Rep. Vote (20s) x Post-1994',
    'partyR' = 'Republican',
    'partyR:gingrich' = 'Republican x Post-1994',
 
    'party_bias' = 'Party Imbalance',
    'region_bias' = 'Region Imbalance',
    'election' = 'Election Year',
    'running' = 'Governor Running for Re-Election',
    'unif' = 'Unified State Government'
    )

modelsummary(list(bias_controls),
    coef_map = c_map_bias,
    stars = TRUE,
    gof_omit = 'AIC|BIC|FE|Std.',
    output = 'latex_tabular', 
    escape = FALSE)

# ---- Table 1: Most Republican and Democratic Sentences, 2023 (Main Text) ----
# Selects the 5 most extreme sentences from each end of the predicted probability
# distribution plus 3 near 0.50, for the 2023 speeches

# empirical range of observations
round(range(sots_sents$probability_class_1, na.rm = TRUE),3)

set.seed(42)
bind_rows(
    sots_sents %>% 
        filter(year == 2023) %>% 
        arrange(probability_class_1) %>% 
        select(sentence_id, state, party, sentence, probability_class_1) %>% 
        head(5),

    sots_sents %>% 
        filter(year == 2023 & probability_class_1 > 0.4999 & probability_class_1 < 0.5001) %>% 
        arrange(desc(probability_class_1)) %>% 
        select(sentence_id, state, party, sentence, probability_class_1) %>% 
        slice(1,5,6),

    sots_sents %>% 
        filter(year == 2023) %>% 
        arrange(desc(probability_class_1)) %>% 
        select(sentence_id, state, party, sentence, probability_class_1) %>%
        head(5) %>% 
        slice(5:1)
    ) %>% 
    left_join(tibble(state = datasets::state.name, state.abb = datasets::state.abb)) %>%
    # Create formatted state column with party
    mutate(State = paste0(state.abb, " (", party, ")"),
           `Pr(Rep.)` = round(probability_class_1, 2)) %>%
    select(State, Sentence = sentence, `Pr(Rep.)`)

# ---- Inline Stats + Table A5: Tail Decomposition (Appendix A.4) ----
# Produces inline statistics about tail distributions discussed in Appendix B.1
# (face validity), plus Table A5 (The Most Partisan Sentences Drive Speech-Level
# Partisanship). Sentences are grouped into upper/lower 5th percentile tails
# and a center band; regression shows tails' contribution to speech-level scores.
pr_region_coded <- sots_sents %>%
    mutate(prob_rank = percent_rank(probability_class_1)) %>% 
    mutate(pr_region = case_when(
            prob_rank >= 0.95 ~ 'high_tail',
            prob_rank <= 0.05 ~ 'low_tail',
            TRUE ~ 'central'
        ))

pr_region_coded %>% 
    group_by(pr_region) %>% 
    summarise(n = n()) %>% 
    mutate(pct = n/nrow(pr_region_coded))

speech_composition <- pr_region_coded %>% 
    group_by(speech_id, pr_region) %>% 
    summarise(n = n()) %>% 
    pivot_wider(names_from = pr_region, values_from = n, values_fill = list(n = 0)) %>% 
    left_join(gov_covar_cw %>% select(speech_id, party, predicted_class, year, gingrich, avg_probability_class_1))

speech_composition %>%
    group_by(predicted_class) %>%
    summarise(
        avg_low_tail = mean(low_tail, na.rm = TRUE),
        avg_high_tail = mean(high_tail, na.rm = TRUE),
        avg_center = mean(central, na.rm = TRUE)
    ) %>%
    kable(format = "latex",
          booktabs = TRUE,
          digits = 2,
          col.names = c("Predicted Class", "Avg Low Tail", "Avg High Tail", "Avg Center"),
          caption = "Distribution of Sentence Types by Predicted Class") %>%
    kable_styling(latex_options = c("hold_position"))

tails_var <- feols(I(avg_probability_class_1 * 100) ~ central + high_tail + low_tail, speech_composition)

c_map_var <- c(
    'high_tail' = 'Upper Tail',
    'low_tail' = 'Lower Tail',
    'central' = 'Center Band',
    '(Intercept)' = 'Intercept'
    )

modelsummary(list(tails_var),
    coef_map = c_map_var,
    stars = TRUE,
    output = 'latex_tabular', 
    escape = FALSE)

# ---- Inline Stats: Face Validity with Human Labeling (Appendix A.4) ----
# Computes RA accuracy, inter-rater agreement, and confusion matrix for
# pairwise sentence comparisons coded by two research assistants
# read true and ra codes
pw <- read_csv('pairwise_ra_codes_ans_20251216.csv')
ra1 <- read_csv('pairwise_ra_codes_20251216-s.csv')
ra2 <- read_csv('n_pairwise_Ra_codes.csv')

# reformat answers
pw_ans <- pw %>% 
    mutate(true_r = if_else(pr_1 > pr_2, 1, 2))

table(pw_ans$true_r, ra1$more_republican)

# ra accuracy
sum(pw_ans$true_r == ra1$more_republican)/250 # 0.76
sum(pw_ans$true_r == ra2$more_republican)/250 # 0.77

table(ra2$more_republican, ra1$more_republican)

# ra errors
ra1_err <- which(ra1$more_republican != pw_ans$true_r)
ra2_err <- which(ra2$more_republican != pw_ans$true_r)

# subset to ra overlap and code matches
ra_overlap <- ra1[which(ra1$more_republican == ra2$more_republican),]
pw_set <- ra_overlap %>% 
    left_join(pw_ans) %>% 
    mutate(ra_rep = if_else(more_republican == 1, 1, 0),
        true_r = if_else(true_r == 1, 1, 0),
        match = if_else(true_r == ra_rep, 1, 0))

# ra accuracy in matching set
sum(pw_set$match)/nrow(pw_set)
table(pw_set$ra_rep, pw_set$true_r)
binom.test(11, 24, p = 0.5)

# ---- Inline Stats: Permutation Test for Transcription Errors (Appendix A.5) ----
# Grid-based permutation test: for each (p, q) pair, randomly swaps characters
# in transcripts and re-classifies. Compares permuted accuracy to true accuracy
# and corpus-wide year-wise accuracy SD.

# permutation test df
perm_tests <- read_csv('perm-test.df')

# compute acc of each perm test
acc_tab <- perm_tests %>% 
    mutate(match = if_else(true_label == predicted_class, 1, 0)) %>% 
    group_by(p, q) %>% 
    summarise(acc = mean(match)) %>% 
    filter(q <= 0.3 & p <= 0.3)

# 2023 true accuracy
gov_covar_cw %>% 
    filter(year == 2023) %>% 
    pull(match) %>% 
    mean() %>% 
    round(2)

# summarize permutation acc
summary(acc_tab$acc)

# corpus-wide year-wise accuracy
year_avg_acc <- gov_covar_cw %>% 
    group_by(year) %>% 
    summarise(acc = mean(match)) %>% 
    pull(acc)

# sd of tests
round(sd(acc_tab$acc),3)
# true sd of corpus
round(sd(year_avg_acc),3)
# sd ratio
round(sd(acc_tab$acc)/sd(year_avg_acc),2)


# ---- Figure 1: Correlation Between NOMINATE and Slant Measure (Main Text) ----
# matched governor nominate scores
nominate_agg <- read_csv('gov-nominate-agg.csv')

round(cor(nominate_agg$avg_r, nominate_agg$nominate_dim1, use = 'pairwise.complete.obs'),2)

gov_nominate_cor <- ggplot(nominate_agg, aes(x = nominate_dim1, y = avg_r)) + 
    geom_point(aes(color = party)) + 
    geom_smooth(method = 'lm', color = 'black') + 
    labs(x = 'NOMINATE First Dimension', y = 'Avg. Probability Republican', color = 'Party') + 
    scale_color_manual(values = c(ggblue, ggred)) + 
    theme_baselike() 

ggsave('nominate_corr.png', gov_nominate_cor, height = 4, width = 7)

# ---- Figure 2: Ordered Partisan Slant, 2023 (Main Text) ----
# Produces ord_2023.pdf. Bootstrap CIs: 1000 resamples of sentence-level
# predictions per state, then extracts 2.5th/97.5th percentile bounds.
ssc2023 <- sots_sents %>% filter(year == 2023)
boot_df <- c()
set.seed(20250924)
for(st in unique(ssc2023$state)){
    print(st)
    st_i <- ssc2023 %>% filter(state == st)

    boot_i <- c()
    for(i in 1:1000){
        st_boot <- st_i %>% sample_n(nrow(st_i), replace = TRUE)
        boot_i <- c(boot_i, mean(st_boot$probability_class_1, na.rm = TRUE))
    }

    row_st_i <- tibble(
        state = st,
        est = gov_covar_cw %>% filter(year == 2023 & state == st) %>% pull(avg_probability_class_1),
        low = quantile(boot_i, 0.025),
        high = quantile(boot_i, 0.975),
        party = gov_covar_cw %>% filter(year == 2023 & state == st) %>% pull(party)
    )

    boot_df <- bind_rows(boot_df, row_st_i)

}
    
bootstrap_2023 <- ggplot(boot_df, aes(x = est, y = reorder(state, est), color = party)) + 
    geom_point() + 
    geom_errorbarh(aes(xmin = low, xmax = high), height = 0) + 
    labs(x = 'Partisan Slant (D to R)', y = '', color = "Governor's Party") + 
    scale_color_manual(values = c(ggblue, ggred)) +
    theme_baselike() + 
    theme(legend.position = 'bottom', legend.direction = 'horizontal') + 
    geom_vline(xintercept = 0.5, linetype = 'dashed')

ggsave('ord_2023.png', bootstrap_2023, height = 8, width = 7)

# ---- Tables B1–B2: Policy vs. Non-Policy Sentences by Governor Type (Appendix B.1) ----
# Sub-section 1: Classify 2023 governors into bins (strong D / moderate / strong R)
# based on whether bootstrap CIs from Figure 2 cross 0.5
bins <- boot_df %>%
    mutate(bin = case_when(
            low < 0.5 & high > 0.5 ~ 'M',
            low > 0.5 ~ 'R',
            high < 0.5 ~ 'D'
        ))

# Sub-section 2: CAP keyword dictionary (from Eshima, Imai, & Sasaki 2024)
# Used to classify sentences as policy vs. non-policy
cap_keywords <- list(
    government_operations = c("administrative", "advertising", "appointment", 
        "attack", "auditing", "branch", "campaign", "capital", "census", "city", 
        "coin", "collection", "currency", "mail", "medal", "mint", "nomination", 
        "post", "postal", "registration", "statistic", "terrorist", "victim", 
        "voter"),

    public_lands = c("fire", "flood", "forest", "grazing", "historic", 
        "indigenous", "land", "livestock", "natural", "parks", "recreation", 
        "resource", "site", "staff", "territorial", "territorie", "water"),

    defense = c("armed", "base", "capability", "civilian", "compliance",
        "contractor", "coordination", "damage", "equipment", "foreign", 
        "homeland", "installation", "intelligence", "material", "military", 
        "nuclear", "operation", "personnel", "procurement", "reserve",
        "security", "services", "weapon"),

    domestic_commerce = c("account", "accounting", "bankruptcy", "business", 
        "card", "commerce", "commercial", "commodity","consumer", "cost", 
        "credit", "disaster", "finance", "financial", "fraud", "industry", 
        "insurance","investment", "management", "mortgage", "patent", "promote", 
        "property", "relief", "security"),

    law_crime = c("abuse", "border", "code", "combat", "court", "crime", 
        "criminal", "custom", "cyber", "drug", "enforcement", "family", "fine", 
        "judiciary", "justice", "juvenile", "legal", "penalty", "police",
        "prison", "release", "representation", "sexual", "terrorism", "violence"),

    health = c("abuse", "alcohol", "care", "clinical", "cost", "cover",
        "coverage", "disease", "drug", "health", "insurance", "insurer", 
        "liability", "license", "medical", "mental", "pay", "payment", 
        "prescription", "prevention", "provider", "rehabilitation", "supply",
        "tobacco", "treatment"),

    international_affairs = c("aid", "assessment", "associate", "citizen",
        "combat", "committee", "convention", "country", "cross", "develop",
        "directly", "foreign", "human", "international", "monetary", "ocean",
        "region", "regional", "sea", "target", "terrorism", "treaty", "union",
        "world"),

    transportation = c("air", "airport", "aviation", "channel", "construction",
        "deployment", "freight", "highway", "infrastructure", "inland",
        "maintenance", "maritime", "mass", "pilot", "rail", "railroad", "ship",
        "traffic", "transportation", "travel", "waterway"),

    macroeconomics = c("bank", "budget", "budgeting", "central", "cost",
        "deficit", "growth", "industrial", "inflation", "interest",
        "macroeconomic", "manufacturing", "monetary", "price", "revitalization",
        "tax", "treasury"),

    environment = c("alternative", "asbestos", "chemical", "climate",
        "conservation", "disposal", "drinking", "endanger", "environment",
        "environmental", "hazardous", "laboratory", "performance", "pollution",
        "protection", "resource", "solid", "specie", "supply", "toxic", "waste",
        "wastewater", "water", "wildlife"),

    education = c("adult", "area", "bilingual", "college", "education",
        "educational", "elementary", "excellence", "handicapped", "improve",
        "language", "literacy", "loan", "mentally", "need", "outcome",
        "physically", "primary", "school", "schools", "secondary", "skill",
        "student", "university", "vocational"),

    energy = c("alternative", "biofuel", "clean", "coal", "conservation",
        "drilling", "electrical", "electricity", "energy", "gas",
        "gasification", "gasoline", "geothermal", "hydrogen", "hydropower",
        "natural", "nuclear", "oil", "power", "production", "renewable",
        "shortage", "spill", "utility", "vehicle"),

    technology = c("broadcast", "communication", "computer", "cooperation",
        "encourage", "exploration", "forecast", "form", "geological",
        "internet", "publishing", "radio", "research", "satellite", "science",
        "space", "speed", "survey", "technology", "telecommunication",
        "telecommunications", "telephone", "television", "transfer", "weather"),

    labor = c("bargaining", "benefit", "compensation", "debt", "employee",
        "employer", "employment", "fair", "injury", "insurance", "job",
        "labor", "minimum", "pension", "protection", "retirement", "standard",
        "training", "unemployment", "union", "wage", "work", "worker",
        "workforce", "youth"),

    foreign_trade = c("agreement", "balance", "barrier", "competitiveness",
        "dispute", "exchange", "export", "foreign", "import", "international",
        "negotiation", "private", "productivity", "subsidy", "tariff", "trade",
        "treaty"),

    civil_rights = c("abortion", "age", "civil", "contract", "discrimination",
        "ethnic", "expression", "franchise", "freedom", "gender", "group",
        "information", "mandatory", "minority", "participation", "party",
        "privacy", "racial", "religious", "right", "rights", "sex", "sexual",
        "speech", "voting"),

    social_welfare = c("aid", "alleviate", "assess", "assistance",
        "association", "care", "charity", "child", "disability", "disable",
        "elderly", "family", "income", "leave", "parental", "physical",
        "poverty", "social", "volunteer", "welfare", "youth"),

    agriculture = c("agricultural", "agriculture", "animal", "crop", "farmer",
        "fish", "fishery", "food", "inspection", "labeling", "market", "pest",
        "pesticide", "product", "rancher", "seafood", "subsidy"),

    housing = c("affordability", "community", "economic", "family", "handicap",
        "homeless", "homelessness", "housing", "income", "neighborhood",
        "rural", "urban", "veteran"),

    immigration = c("refugee", "citizenship", "immigration"),

    culture = c("culture", "cultural")
    )

all_keywords <- unique(unlist(cap_keywords, use.names = FALSE))
pattern <- paste0("\\b(", paste(all_keywords, collapse = "|"), ")")

# Sub-section 3: Match sentences to policy keywords and compute Table A6
sots_2023_bins <- sots_sents %>%
    filter(year == 2023) %>%
    left_join(bins %>% select(state, bin)) %>% 
    mutate(is_policy = if_else(str_detect(tolower(sentence), pattern), 1, 0))

# Create formatted table for policy vs non-policy by governor type
policy_table <- sots_2023_bins %>%
    group_by(bin, is_policy) %>%
    summarise(avg_slant = mean(probability_class_1, na.rm = TRUE),
        n_sentences = n(),
        .groups = 'drop') %>%
    left_join(sots_2023_bins %>%
        group_by(bin) %>%
        summarise(n_tot = n())) %>%
    mutate(prop_sentences = n_sentences / n_tot) %>%
    mutate(
        Governor_Type = case_when(bin == 'D' ~ 'Strong Democrat',
                                   bin == 'M' ~ 'Moderate',
                                   bin == 'R' ~ 'Strong Republican'),
        Sentence_Type = if_else(is_policy == 1, 'Policy', 'Non-Policy'),
        Proportion = round(prop_sentences, 2),
        Slant = round(avg_slant, 2)
    ) %>%
    select(Governor_Type, Sentence_Type, Proportion, Slant) %>%
    pivot_wider(
        names_from = Sentence_Type,
        values_from = c(Proportion, Slant),
        names_glue = "{Sentence_Type}_{.value}"
    ) %>%
    select(Governor_Type, Policy_Proportion, Policy_Slant, `Non-Policy_Proportion`, `Non-Policy_Slant`) %>%
    rename(
        "Governor Type" = Governor_Type,
        "Prop." = Policy_Proportion,
        "Slant" = Policy_Slant,
        "Prop. " = `Non-Policy_Proportion`,
        "Slant " = `Non-Policy_Slant`
    )

# Generate LaTeX table
policy_table %>%
    kable(format = "latex",
          booktabs = TRUE,
          caption = "Policy and Non-Policy Sentence Composition by Governor Type, 2023",
          label = "tab:policy_composition",
          escape = FALSE) %>%
    kable_styling(latex_options = c("hold_position")) %>%
    add_header_above(c(" " = 1, "Policy" = 2, "Non-Policy" = 2)) %>%
    row_spec(0, bold = TRUE)


# Create LaTeX table of example sentences by governor type and policy area
example_sentences <- bind_rows(
    # Education examples
    sots_2023_bins %>% filter(sentence_id == 'e12f9d12') %>%
        select(bin, sentence, probability_class_1),
    sots_2023_bins %>% filter(sentence_id == 'a27fa486') %>%
        select(bin, sentence, probability_class_1),
    sots_2023_bins %>% filter(sentence_id == '03ca4de3') %>%
        select(bin, sentence, probability_class_1),

    # Healthcare examples
    sots_2023_bins %>% filter(sentence_id == '4da8c640') %>%
        select(bin, sentence, probability_class_1),
    sots_2023_bins %>% filter(sentence_id == '27a0883d') %>%
        select(bin, sentence, probability_class_1),
    sots_2023_bins %>% filter(sentence_id == '06b69789') %>%
        select(bin, sentence, probability_class_1)
) %>%
    mutate(
        Governor_Type = case_when(bin == 'D' ~ 'Strong Democrat',
                                   bin == 'M' ~ 'Moderate',
                                   bin == 'R' ~ 'Strong Republican'),
        `Pr(Rep.)` = round(probability_class_1, 2)
    ) %>%
    select(Governor_Type, sentence, `Pr(Rep.)`) %>%
    rename(
        "Governor Type" = Governor_Type,
        "Sentence" = sentence
    )

# Generate LaTeX table
example_sentences %>%
    kable(format = "latex",
          booktabs = TRUE,
          caption = "Example Sentences by Governor Type, 2023",
          label = "tab:example_sentences",
          escape = TRUE) %>%
    kable_styling(latex_options = c("hold_position")) %>%
    column_spec(2, width = "10cm") %>%
    row_spec(0, bold = TRUE)

# ---- Figure 3: Polarization in SOTS Speeches Over Time (Main Text) ----
# Produces ridge_plot.pdf — density ridges by year, colored by party
pol_ridge_plot <- ggplot(gov_covar_cw, aes(x = m_class, y = reorder(as.factor(year),rev(year)), fill = party)) +
    geom_density_ridges(alpha = 0.5) + 
    labs(x = 'Probability Republican', y = '', fill = 'Party') + 
    scale_fill_manual(values = c(ggblue, ggred)) + 
    theme_baselike() + 
    scale_y_discrete(expand = c(0.04,0.01)) + 
    coord_cartesian(ylim = c(2, 62))

ggsave('ridge_plot.png', pol_ridge_plot, height = 10, width = 6)


# ---- Figure B1: Proportion Crossing Opposing Party Median (Appendix B.1) ----
# Produces overlap_trend.pdf — for each year, computes the average proportion of
# governors whose slant crosses the opposing party's median
overlap_df <- c()
for(y in 1962:2023){

    d_y <- gov_covar_cw %>% filter(party == 'D' & year == y)
    r_y <- gov_covar_cw %>% filter(party == 'R' & year == y)

    dem_median <- median(d_y %>% pull(avg_probability_class_1))
    rep_median <- median(r_y %>% pull(avg_probability_class_1))

    ovrlp <- (mean(d_y$avg_probability_class_1 > rep_median) + 
             mean(r_y$avg_probability_class_1 < dem_median)) / 2
 
    overlap_df <- bind_rows(overlap_df, tibble(year = y, overlap = ovrlp))

}

overlap_df %>% filter(year < 2000) %>% pull(overlap) %>% summary
overlap_df %>% filter(year >= 2000) %>% pull(overlap) %>% summary

overlap_df[which(overlap_df$overlap > 0.1 & overlap_df$year >= 2004),]

overlap_over_time <- ggplot(overlap_df, aes(x = year, y = overlap)) +
  geom_point() +
  geom_smooth(method = 'loess', se = FALSE, color = 'black') +
  labs(y = "Average Proportion of Partisans\nCrossing Opposing Party Median", x = "Year") + 
  theme_baselike()

ggsave('overlap_trend.pdf', overlap_over_time, height = 4, width = 8)

# ---- Figure 4: Partisan Change in SOTS Speeches Over Time (Main Text) ----
# Produces state_trend_plot.pdf — spaghetti plot with VA, WA, WY highlighted

highlight_states <- c("Virginia", "Washington", "Wyoming")

# Create endpoint data for state labels
endpoint_data <- gov_covar_cw %>%
  filter(state %in% highlight_states) %>%
  group_by(state) %>%
  filter(year == max(year)) %>%
  ungroup() %>%
  mutate(state_abbr = case_when(
    state == "Virginia" ~ "VA",
    state == "Washington" ~ "WA",
    state == "Wyoming" ~ "WY"
  ))

state_trend_plot <- ggplot() +
  # Background lines for all states
  geom_line(
    data = gov_covar_cw,
    aes(x = year, y = m_class, group = state),
    color = "gray70", alpha = 0.6
  ) +
  # Highlight lines for the chosen states
  geom_line(
    data = filter(gov_covar_cw, state %in% highlight_states),
    aes(x = year, y = m_class, color = state, linetype = state),
    size = .75
  ) +
  annotate("text", x = 1963, y = 0.22, label = "All other states (gray)",
    color = "gray50", size = 3, hjust = 0, fontface = "italic") +
  # Add endpoint points for clarity
  geom_point(
    data = endpoint_data,
    aes(x = year, y = m_class),
    size = 2.5,
    shape = 21,
    fill = "white",
    color = "black",
    stroke = 1
  ) +
  # Add state abbreviation labels at line endpoints
  geom_text(
    data = endpoint_data %>%
      mutate(
        nudge_y_val = case_when(
          state == "Virginia" ~ -0.015,      # Push VA down
          state == "Washington" ~ 0,         # Keep WA centered
          state == "Wyoming" ~ 0.015         # Push WY up
        )
      ),
    aes(x = year, y = m_class + nudge_y_val, label = state_abbr),
    nudge_x = 2.5,
    hjust = 1,
    size = 3.5,
    fontface = "bold"
  ) +
  scale_color_manual(values = c("Virginia" = "black", "Washington" = "black", "Wyoming" = "black")) +
  scale_linetype_manual(values = c("Virginia" = "solid", "Washington" = "dashed", "Wyoming" = "dotted")) +
  scale_x_continuous(breaks = seq(1960, 2025, by = 5)) +
  theme_minimal() +
  labs(x = "Year", y = "Probability Republican (Mean Centered)", color = "State", linetype = "State") +
  geom_hline(yintercept = 0, linetype = 'dashed') +
  theme_baselike() + 
  theme(legend.position = 'bottom', legend.direction = 'horizontal', legend.key.width = unit(1.5, "cm"))+ 
  guides(
    color    = guide_legend(override.aes = list(size = 2)),
    linetype = guide_legend(override.aes = list(size = 2))
  ) 

ggsave('state_trend_plot.png', state_trend_plot, height = 4, width = 8)

# ---- Table 2: Main Regression Results (Main Text) ----
# Two models: (1) main effects only, (2) interaction with post-1994 indicator
pr <- feols(pr_rep_scale ~ (party +  rep20) + running + election + unif | state + year, gov_covar_cw, cluster = ~state)
int_pr <- feols(pr_rep_scale ~ (party +  rep20) * gingrich + running + election + unif | state + year, gov_covar_cw, cluster = ~state)

c_map <- c(
    'rep20' = 'Two-Party Republican Vote Share (20s)',
    'rep20:gingrich' = 'Two-Party Rep. Vote (20s) x Post-1994',
    'partyR' = 'Republican',
    'partyR:gingrich' = 'Republican x Post-1994',
 
    'gov_dempct' = 'Governor Democratic Vote Share',
    'election' = 'Election Year',
    'running' = 'Governor Running for Re-Election',
    'unif' = 'Unified State Government'
    )

modelsummary(list(pr, int_pr),
    coef_map = c_map,
    stars = TRUE,
    gof_omit = 'AIC|BIC|FE|Std.',
    output = 'latex_tabular',
    escape = FALSE) 

# ---- Figure 5: Marginal Effects (Main Text) ----
# Produces me_plot.pdf — marginal effects from Table 2, Column 2
me_party <- slopes(int_pr, variables = 'party', newdata = datagrid(gingrich = c(0,1)))
me_vote <- slopes(int_pr, variables = 'rep20', newdata = datagrid(gingrich = c(0,1)))
mes <- bind_rows(me_party, me_vote) %>% 
    mutate(gingrich2 = if_else(gingrich == 1, 'Post-1994', 'Pre-1995'),
        gingrich2 = factor(gingrich2, levels = c('Pre-1995', 'Post-1994')),
        term2 = if_else(term == 'party', 'Party (R - D)', 'Rep. Vote (+20)'))

me_int_plot <- ggplot(mes, aes(x = term2, y = estimate, shape = gingrich2)) + 
    geom_point(position = position_dodge(0.3), size = 2) +
    geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0, position = position_dodge(0.3)) +
    geom_hline(yintercept = 0, linetype = 'dashed') + 
    theme_baselike() +
    labs(x = '', y = 'Marginal Effect on SOTS Republican\nPartisan Slant (Std. Dev. Scale)', shape = '') 

ggsave('me_plot.png', me_int_plot, height = 4, width = 8)

sd(gov_covar_cw$avg_probability_class_1)
sd(gov_covar_cw$avg_probability_class_1) * 0.6
sd(gov_covar_cw$avg_probability_class_1) * 0.96

# ---- Inline Stats: Cluster-Bootstrapped Marginal Effect Differences (Main Text) ----
# 1000 iterations of cluster bootstrap at the state level to compute CIs for
# differences in marginal effects (party over time, constituency over time,
# party vs. constituency post-1994). Reported inline in the results section.
# computationally intensive, uncomment to run

# set.seed(20250403)
# vec <- vec2 <- vec3 <- c()
# for(i in 1:1000){
#     if(i %% 25 == 0){print(i)}
#     state_vec <- sample(unique(gov_covar_cw$state), 50, replace = TRUE)

#     state_df <- c()
#     for(st in state_vec){
#         state_tmp <- gov_covar_cw %>% filter(state == st)
#         state_df <- bind_rows(state_df, state_tmp)
#     }

#     d <- feols(pr_rep_scale ~ (party +  rep20) * gingrich + running + election + unif | state + year, state_df, cluster = ~state)
#     mep <- slopes(d, variables = 'party', newdata = datagrid(gingrich = c(0,1)))
#     mec <- slopes(d, variables = 'rep20', newdata = datagrid(gingrich = c(0,1)))
#     vec <- c(vec, mep$estimate[2] - mep$estimate[1])
#     vec2 <- c(vec2, mep$estimate[2] - mec$estimate[2])
#     vec3 <- c(vec3, mec$estimate[2] - mec$estimate[1])


# }

# #differences, party over time
# quantile(vec, c(0.025, 0.5, 0.975))
# #differences, constituency over time
# quantile(vec3, c(0.025, 0.5, 0.975))
# # differences, party vs constituency post-1994
# quantile(vec2, c(0.025, 0.5, 0.975))

# sd(gov_covar_cw$avg_probability_class_1)

# ---- Table B3: Time-Varying Trends are Robust (Appendix B.2) ----
# Three alternative time specifications: (1) linear trend, (2) decade
# interactions, (3) political realignment era interactions
linear_time <- feols(pr_rep_scale ~ (party + rep20) * I(year - 1962) + running + election + unif | state + congress, gov_covar_cw, cluster = ~state)
decade_int <- feols(pr_rep_scale ~ (party + rep20) * as.factor(decade) + running + election + unif | state + year, gov_covar_cw, cluster = ~state)
realign_int <- feols(pr_rep_scale ~ (party + rep20) * realign + running + election + unif | state + year, gov_covar_cw, cluster = ~state)

c_map_decade <- c(
    'partyR' = 'Republican',
    'rep20' = 'Two-Party Republican Vote Share (10s)',

    'I(year - 1962)' = 'Year',

    'partyR:I(year - 1962)' = 'Republican x Year',
    'rep20:I(year - 1962)' = 'Rep. Vote x Year',

    'partyR:as.factor(decade)1970' = 'Republican x 1970', 
    'partyR:as.factor(decade)1980' = 'Republican x 1980',
    'partyR:as.factor(decade)1990' = 'Republican x 1990',
    'partyR:as.factor(decade)2000' = 'Republican x 2000',
    'partyR:as.factor(decade)2010' = 'Republican x 2010',
    'partyR:as.factor(decade)2020' = 'Republican x 2020',

    'rep20:as.factor(decade)1970' = 'Rep. Vote x 1970',
    'rep20:as.factor(decade)1980' = 'Rep. Vote x 1980',
    'rep20:as.factor(decade)1990' = 'Rep. Vote x 1990',
    'rep20:as.factor(decade)2000' = 'Rep. Vote x 2000',
    'rep20:as.factor(decade)2010' = 'Rep. Vote x 2010',
    'rep20:as.factor(decade)2020' = 'Rep. Vote x 2020',

    'partyR:realign1981-1994' = 'Republican x 1981-1994',
    'partyR:realign1995-2016' = 'Republican x 1995-2016',
    'partyR:realign2017+' = 'Republican x 2017+',

    'rep20:realign1981-1994' = 'Rep. Vote x 1981-1994',
    'rep20:realign1995-2016' = 'Rep. Vote x 1995-2016',
    'rep20:realign2017+' = 'Rep. Vote x 2017+',

    'gov_dempct' = 'Governor Democratic Vote Share',
    'election' = 'Election Year',
    'running' = 'Governor Running for Re-Election',
    'unif' = 'Unified State Government'
)

modelsummary(list(linear_time, decade_int, realign_int),
    coef_map = c_map_decade,
    stars = TRUE,
    gof_omit = 'AIC|BIC|FE|Std.',
    output = 'latex_tabular', 
    escape = FALSE)

# ---- Table B4: Party Split Models — Dem Only / Rep Only (Appendix B.2) ----
c6_dem <- feols(pr_rep_scale ~ rep20 * gingrich + running + election + unif | state + year, gov_covar_cw %>% filter(party == 'D'), cluster = ~state)
c6_rep <- feols(pr_rep_scale ~ rep20 * gingrich + running + election + unif | state + year, gov_covar_cw %>% filter(party == 'R'), cluster = ~state)

modelsummary(list(c6_dem, c6_rep),
    coef_map = c_map,
    stars = TRUE,
    gof_omit = 'AIC|BIC|FE|Std.',
    output = 'latex_tabular', 
    escape = FALSE)

# ---- Table B5: Replication with Additional Controls (Appendix B.2) ----
# Controls: policy mood, unemployment, per capita income, primary type
int_pr_ec <- feols(pr_rep_scale ~ (party +  rep20) * gingrich + running + election + unif + primary_type + mood + unemployment_rate + pcpi_real_2023 | state + congress, gov_covar_cw, cluster = ~state)
slopes(int_pr_ec, variables = 'rep20',  newdata = datagrid(gingrich = c(0,1)))

c_map <- c(
    'rep20' = 'Two-Party Republican Vote Share (20s)',
    'rep20:gingrich' = 'Two-Party Rep. Vote (20s) x Post-1994',
    'partyR' = 'Republican',
    'partyR:gingrich' = 'Republican x Post-1994',
 
    'mood' = 'Policy Mood',
    'unemployment_rate' = 'Unemployment Rate',
    'pcpi_real_2023' = 'Per Capita Income (2023 Dollars)',
    'primary_typeopen' = 'Open Primary',
    'primary_typepartial' = 'Partially Open Primary',
    'primary_typeother' = 'Other Primary Type',

    'gov_dempct' = 'Governor Democratic Vote Share',
    'election' = 'Election Year',
    'running' = 'Governor Running for Re-Election',
    'unif' = 'Unified State Government'
    )

modelsummary(list(int_pr_ec),
    coef_map = c_map,
    stars = TRUE,
    gof_omit = 'AIC|BIC|FE|Std.',
    output = 'latex_tabular', 
    escape = FALSE)

# ---- Table B6 + Figure 6: Approval Regression and Marginal Effects (Main Text / Appendix B.2) ----
# Data from CES 2006-2020. Produces:
#   - Table A11: Approval regression results
#   - Figure 6 (me_app_plot.pdf): Marginal effects of partisan slant on approval

# Sub-section 1: CES data prep
cc <- read_dta("cumulative_2006-2024.dta")
ccs <- cc %>% 
    mutate(gov_app = case_when(approval_gov %in% 1:2 ~ 1,
        approval_gov == 6 ~ 0, 
        approval_gov %in% 3:4 ~ -1,
        TRUE ~ NA_real_),
        state = as.character(as_factor(state)),
        pidr = case_when(pid7 %in% 1:3 ~ 'D',
            pid7 == 4 ~ 'I',
            pid7 %in% 5:7 ~ 'R',
        TRUE ~ NA_character_)) %>% 
    select(year, case_id, state, pidr, gov_app)

# Sub-section 2: Merge speech data with CES respondent-level approval
gov_agg <- gov_covar_cw %>%
    group_by(state, year, party, election, running, unif, gingrich, rep20, govlast) %>%
    summarise(avg_r_slant = mean(avg_probability_class_1)) %>% 
    filter(year %in% 2006:2020) 

gov_agg$avg_r_scale <- scale(gov_agg$avg_r_slant)[,1]

app_df <- left_join(gov_agg, ccs) %>% 
    left_join(cw) %>% 
    ungroup()

# Sub-section 3: Approval model and Table A11
gov_mod_int <- feols(gov_app ~ avg_r_scale * pidr + policy_updated * pidr + party + rep20 + running + election + unif | govlast + state + year, app_df, cluster = ~case_id + govlast)
app_me_speech <- slopes(gov_mod_int, variables = 'avg_r_scale', newdata = datagrid(pidr = c('D', 'R')))
app_me_policy <- slopes(gov_mod_int, variables = 'policy_updated', newdata = datagrid(pidr = c('D', 'R')))

gov_mod_int3 <- feols(gov_app ~ avg_r_scale * policy_updated * pidr + party + rep20 + running + election + unif | govlast + state + year, app_df, cluster = ~case_id + govlast)
slopes(gov_mod_int3, variables = 'avg_r_scale', newdata = datagrid(pidr = c('D', 'R'), policy_updated = c(-1.99, 1.06)))

app_df %>%
  summarise(across(c(gov_app, avg_r_scale, pidr, policy_updated, party, rep20, running, election,
    unif, govlast, year),
    ~sum(is.na(.)))) %>% glimpse

c_map_app <- c(
    'avg_r_scale' = 'Partisan Slant (R)',
    'pidrR' = 'Republican Respondent',
    'avg_r_scale:pidrR' = 'Partisan Slant (R) x Republican Respondent',
    'policy_updated' = 'Policy Conservatism',
    'pidrR:policy_updated' = 'Policy Conservatism x Republican Respondent',
    'partyR' = 'Republican Gov.',
    'rep20' = 'Two-Party Republican Vote Share (20s)',
    'pidrI' = 'Independent Respondent',
    'avg_r_scale:pidrI' = 'Partisan Slant (R) x Independent Respondent',
    'pidrI:policy_updated' = 'Policy Conservatism x Independent Respondent',
    'election' = 'Election Year',
    'running' = 'Governor Running for Re-Election',
    'unif' = 'Unified State Government'
    )

modelsummary(list(gov_mod_int),
    coef_map = c_map_app,
    stars = TRUE,
    gof_omit = 'AIC|BIC|FE|Std.',
    output = 'latex_tabular', 
    escape = FALSE)

# Sub-section 4: Figure 6 — marginal effects plot
mes_app <- bind_rows(app_me_speech, app_me_policy) %>%
    mutate(var2 = if_else(term == 'avg_r_scale', 'Rep. Partisan Slant (SOTS)', 'Policy Conservatism'),
        party2 = if_else(pidr == 'R', 'Republican', 'Democrat'))

me_app_plot <- ggplot(mes_app, aes(x = party2, y = estimate, shape = var2)) + 
    geom_point(position = position_dodge(0.3), size = 2) +
    geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0, position = position_dodge(0.3)) +
    geom_hline(yintercept = 0, linetype = 'dashed') + 
    theme_baselike() +
    labs(x = 'Respondent Party', y = 'Marginal Effect on Governor Approval', shape = '') 

ggsave('me_app_plot.png', me_app_plot, height = 4, width = 8)

# ---- Table B7: Replication with Fine-Tuning by Political Realignment Eras (Appendix B.2) ----
sots_eras <- read_csv('sots_eras.csv')

eras_covars <- gov_covar_cw %>%
    left_join(sots_eras %>%
        select(speech_id, eras_pr = avg_probability_class_1)) %>%
        mutate(eras_pr_scale = scale(eras_pr)[,1])

eras_mod <- feols(eras_pr_scale ~ (party +  rep20) * gingrich + running + election + unif | state + year, eras_covars, cluster = ~state)

modelsummary(list(eras_mod),
    coef_map = c_map,
    stars = TRUE,
    gof_omit = 'AIC|BIC|FE|Std.',
    output = 'latex_tabular', 
    escape = FALSE)
